
#include <stdio.h>
#include <stdlib.h>


#define N 100


/*
 * Fill an N x N matrix of doubles inside an OpenACC parallel region so that
 * element (i, j) holds the value i * N + j.
 *
 * The matrix is stored as one contiguous row-major buffer, so element (i, j)
 * lives at a[i * size + j].
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int main(void) {
    const int size = N;
    double *a;

    a = malloc((size_t)N * N * sizeof *a);
    if (a == NULL) {
        fprintf(stderr, "failed to allocate %d x %d matrix\n", N, N);
        return EXIT_FAILURE;
    }

    /*
     * Intended to launch with a 32x32 configuration (32 gangs of 32 workers).
     * The sizes are requested on the parallel construct; the loop directive
     * only says which levels of parallelism share the iterations, because
     * size arguments on gang/worker are not accepted inside a parallel
     * region. How this maps onto a hardware grid is up to the implementation.
     *
     * The buffer is only written in the region, so it is declared copyout
     * with an explicit extent (a bare pointer carries no size information).
     */
    #pragma acc parallel num_gangs(32) num_workers(32) copyout(a[0:size*size])
    {
        /*
         * Loop indices are declared in the for statements so each executing
         * thread gets its own copy; a shared inner index would race between
         * workers of the same gang.
         */
        #pragma acc loop gang worker
        for (int i = 0; i < size; i++) {
            for (int j = 0; j < size; j++) {
                a[i * size + j] = i * size + j;
            }
        }
    }

    free(a);
    return 0;
}
